/*-------------------------------------------------------------------------
 *
 * chparser.cpp
 *	  a text search parser for Chinese
 *
 *-------------------------------------------------------------------------
 */
#include "chparser.h"

#include "postgres.h"
#include "miscadmin.h"
#include "fmgr.h"
#include "utils/guc.h"
#include "utils/builtins.h"

#if PG_VERSION_NUM >= 100000
#include "utils/varlena.h"
#endif

#include "commands/dbcommands.h"
#include "commands/extension.h"
#include "knl/knl_session.h"

/* dict file extension */
#define TXT_EXT ".txt"
#define XDB_EXT ".xdb"
/* length of file extension */
#define EXT_LEN 4

PG_MODULE_MAGIC;
/*
 * types
 */

#if PG_VERSION_NUM < 150000
#define MarkGUCPrefixReserved(x) EmitWarningsOnPlaceholders(x)
#endif

/*
 * Per-parse state handed back to the text search machinery by chprs_start
 * and passed to chprs_getlexeme on every call.
 */
typedef struct {
    char	   *buffer;			/* text to parse */
    int		len;			/* length of the text in buffer */
    int		pos;			/* position of the parser */
    /* SCWS handle used for this parse (copied from the session context) */
    scws_t scws;
    /* first node of the result list currently being consumed; NULL when done */
    scws_res_t head;
    /* next node of that list to hand out from chprs_getlexeme */
    scws_res_t curr;
} ParserState;

/*
 * copy-paste from wparser.h of tsearch2
 *
 * One entry of the token-type table returned by chprs_lextype.  The table
 * built by init_type is terminated by an entry whose lexid is 0.
 */
typedef struct {
    int			lexid;		/* numeric token type id */
    char	   *alias;		/* short name of the token type */
    char	   *descr;		/* human readable description */
} LexDescr;

static void init();

static void init_type(LexDescr descr[]);

/*
 * prototypes
 */
PG_FUNCTION_INFO_V1(chprs_start);
extern "C" Datum		chprs_start(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(chprs_getlexeme);
extern "C" Datum		chprs_getlexeme(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(chprs_end);
extern "C" Datum		chprs_end(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(chprs_lextype);
extern "C" Datum		chprs_lextype(PG_FUNCTION_ARGS);

uint32 chparser_index;

extern "C" void init_session_vars(void);

/*
 * Per-session state of the extension.  One instance is allocated by
 * init_session_vars and stored in the session's extension variable array
 * at index chparser_index.
 */
typedef struct chparser_session_context {
    /* SCWS segmenter handle; NULL until init() has created it */
    scws_t scws;
    /* parser state reused by every chprs_start call in this session */
    ParserState parser_state;
    /* GUC chparser.dict_in_memory */
    bool dict_in_memory;
    /* GUC chparser.extra_dicts: comma separated dictionary file names */
    char *extra_dicts;
    /* GUC chparser.punctuation_ignore */
    bool punctuation_ignore;
    /* GUC chparser.seg_with_duality */
    bool seg_with_duality;
    /* GUC chparser.multi_short */
    bool multi_short;
    /* GUC chparser.multi_duality */
    bool multi_duality;
    /* GUC chparser.multi_zmain */
    bool multi_zmain;
    /* GUC chparser.multi_zall */
    bool multi_zall;
} chparser_session_context;

/*
 * Remember the slot index under which this extension's session context is
 * stored in u_sess->attr.attr_common.extension_session_vars_array.
 * NOTE(review): presumably called by the server when the extension library
 * is loaded - confirm against the caller.
 */
extern "C" void set_extension_index(uint32 index)
{
    chparser_index = index;
}

/*
 * Return the chparser context of the current session, creating it on first
 * use via init_session_vars().
 */
chparser_session_context* get_session_context()
{
    chparser_session_context *session_ctx =
        (chparser_session_context*)u_sess->attr.attr_common.extension_session_vars_array[chparser_index];

    if (session_ctx == NULL) {
        init_session_vars();
        /* re-read the slot: init_session_vars() is what fills it in */
        session_ctx =
            (chparser_session_context*)u_sess->attr.attr_common.extension_session_vars_array[chparser_index];
    }

    return session_ctx;
}

void init_session_vars(void)
{
    RepallocSessionVarsArrayIfNecessary();
    chparser_session_context* ctx = (chparser_session_context*)MemoryContextAllocZero(u_sess->self_mem_cxt,
                                                                                      sizeof(chparser_session_context));
    u_sess->attr.attr_common.extension_session_vars_array[chparser_index] = ctx;

    ctx->scws = NULL;
    ctx->dict_in_memory = false;
    ctx->extra_dicts = NULL;
    ctx->punctuation_ignore = false;
    ctx->seg_with_duality = false;
    ctx->multi_short = false;
    ctx->multi_duality = false;
    ctx->multi_zmain = false;
    ctx->multi_zall = false;

    DefineCustomBoolVariable("chparser.dict_in_memory", "load dicts into memory", "load dicts into memory",
                             &(get_session_context()->dict_in_memory), false, PGC_BACKEND, 0, NULL, NULL, NULL);

    DefineCustomStringVariable("chparser.extra_dicts", "extra dicts files to load", "extra dicts files to load",
                               &(get_session_context()->extra_dicts), NULL, PGC_BACKEND, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.punctuation_ignore", "set if chparser ignores the puncuation",
                             "set if chparser ignores the puncuation,except \\r and \\n",
                             &(get_session_context()->punctuation_ignore), false, PGC_USERSET, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.seg_with_duality", "segment words with duality", "segment words with duality",
                             &(get_session_context()->seg_with_duality), false, PGC_USERSET, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.multi_short", "prefer short words", "prefer short words",
                             &(get_session_context()->multi_short), false, PGC_USERSET, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.multi_duality", "prefer duality", "prefer duality",
                             &(get_session_context()->multi_duality), false, PGC_USERSET, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.multi_zmain", "prefer most important element", "prefer most important element",
                             &(get_session_context()->multi_zmain), false, PGC_USERSET, 0, NULL, NULL, NULL);

    DefineCustomBoolVariable("chparser.multi_zall", "prefer all element", "prefer all element",
                             &(get_session_context()->multi_zall), false, PGC_USERSET, 0, NULL, NULL, NULL);

    MarkGUCPrefixReserved("chparser");
}

static void init()
{
    char sharepath[MAXPGPATH];
    char dict_path[MAXPGPATH];
    char rule_path[MAXPGPATH];
    int load_dict_mem_mode = 0x0;

    List *elemlist;
    ListCell *l;

    if (!(get_session_context()->scws = scws_new())) {
        ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to init Chinese Parser Lib SCWS!\"%s\"", "")));
    }

    get_share_path(my_exec_path, sharepath);
    snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s.%s",
             sharepath, "dict.utf8", "xdb");
    scws_set_charset(get_session_context()->scws, "utf-8");

    if (get_session_context()->dict_in_memory)
        load_dict_mem_mode = SCWS_XDICT_MEM;

    /* ignore error,default dict is xdb */
    if (scws_set_dict(get_session_context()->scws, dict_path, load_dict_mem_mode | SCWS_XDICT_XDB) != 0) {
        ereport(NOTICE,
            (errcode(ERRCODE_INTERNAL_ERROR), errmsg("chparser set dict : \"%s\" failed!", dict_path)));
    }

    snprintf(dict_path, MAXPGPATH, "%s/base/chprs_dict_%s.txt",
             t_thrd.proc_cxt.DataDir, get_database_name(u_sess->proc_cxt.MyDatabaseId));
    if (scws_add_dict(get_session_context()->scws, dict_path, load_dict_mem_mode | SCWS_XDICT_TXT) != 0) {
        ereport(NOTICE, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("chparser add dict : \"%s\" failed!", dict_path)));
    }

    if (get_session_context()->extra_dicts != NULL) {
        if (!SplitIdentifierString(get_session_context()->extra_dicts, ',', &elemlist)) {
            scws_free(get_session_context()->scws);
            list_free(elemlist);
            get_session_context()->scws = NULL;
            ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
                    errmsg("chparser.extra_dicts syntax error! extra_dicts is \"%s\"",
                           get_session_context()->extra_dicts
                           )));
        }

        foreach(l, elemlist) {
            int load_dict_mode = load_dict_mem_mode;
            char *ext = strrchr((char*)lfirst(l), '.');
            if (ext != NULL && strlen(ext) == EXT_LEN) {
                if (strncmp(ext, TXT_EXT, EXT_LEN) == 0) {
                    load_dict_mode |= SCWS_XDICT_TXT;
                } else if (strncmp(ext, XDB_EXT, EXT_LEN) == 0) {
                    load_dict_mode |= SCWS_XDICT_XDB;
                }
            }

            if (((load_dict_mode & SCWS_XDICT_TXT) == 0) &&
                ((load_dict_mode & SCWS_XDICT_XDB) == 0)) {
                scws_free(get_session_context()->scws);
                list_free(elemlist);
                get_session_context()->scws = NULL;
                ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR),
                     errmsg("chparser.extra_dicts setting error,the file name must end with .txt or .xdb! "
                            "error file name is \"%s\"", (char*)lfirst(l)
                )));
            }

            snprintf(dict_path, MAXPGPATH, "%s/tsearch_data/%s", sharepath, (char*)lfirst(l));
            /* ignore error*/
            if (scws_add_dict(get_session_context()->scws, dict_path, load_dict_mode) != 0) {
                ereport(NOTICE, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("chparser add dict : \"%s\" failed!",
                                                                         dict_path)));
            }
        }
        list_free(elemlist);
    }

    snprintf(rule_path, MAXPGPATH, "%s/tsearch_data/%s.%s", sharepath, "rules.utf8", "ini");
    scws_set_rule(get_session_context()->scws, rule_path);
}

/*
 * functions
 */

/*
 * chprs_start
 *	  Parser "start" callback: begin segmenting a piece of text.
 *
 * Arguments: (0) pointer to the text, (1) its length in bytes.
 * Returns a pointer to the session's ParserState, which is later passed to
 * chprs_getlexeme.
 *
 * The SCWS handle is created lazily on the first call of the session and
 * the segmentation options are re-applied from the chparser.* settings on
 * every call, so changes take effect for the next parse.
 */
Datum chprs_start(PG_FUNCTION_ARGS)
{
    chparser_session_context *ctx = get_session_context();
    ParserState *pst = &(ctx->parser_state);
    int multi_mode = 0x0;

    if (ctx->scws == NULL) {
        init();
    }

    /*
     * The parser state lives for the whole session.  If an earlier parse was
     * abandoned before chprs_getlexeme consumed all results (for example
     * because an error was raised in between), its result list is still
     * owned by pst->head; release it instead of leaking it when head is
     * overwritten below.
     */
    if (pst->head != NULL) {
        scws_free_result(pst->head);
        pst->head = NULL;
        pst->curr = NULL;
    }

    pst->scws = ctx->scws;
    pst->buffer = (char *) PG_GETARG_POINTER(0);
    pst->len = PG_GETARG_INT32(1);
    pst->pos = 0;

    scws_set_ignore(ctx->scws, (int)ctx->punctuation_ignore);
    scws_set_duality(ctx->scws, (int)ctx->seg_with_duality);

    if (ctx->multi_short) {
        multi_mode |= SCWS_MULTI_SHORT;
    }

    if (ctx->multi_duality) {
        multi_mode |= SCWS_MULTI_DUALITY;
    }

    if (ctx->multi_zmain) {
        multi_mode |= SCWS_MULTI_ZMAIN;
    }

    if (ctx->multi_zall) {
        multi_mode |= SCWS_MULTI_ZALL;
    }

    scws_set_multi(ctx->scws, multi_mode);

    scws_send_text(pst->scws, pst->buffer, pst->len);

    /* fetch the first batch of results; NULL means there is nothing to return */
    pst->head = pst->curr = scws_get_result(pst->scws);

    PG_RETURN_POINTER(pst);
}

/*
 * chprs_getlexeme
 *	  Parser "gettoken" callback: hand out the next segmented word.
 *
 * Arguments: (0) ParserState from chprs_start, (1) out: pointer to the
 * token inside the original text, (2) out: token length in bytes.
 * Returns the token type id, or 0 (with *tlen set to 0) when no results
 * are left.
 */
Datum chprs_getlexeme(PG_FUNCTION_ARGS)
{
    ParserState *state = (ParserState *) PG_GETARG_POINTER(0);
    char **token = (char **) PG_GETARG_POINTER(1);
    int *token_len = (int *) PG_GETARG_POINTER(2);
    scws_res_t item;
    int lex_type;

    /* already done the work, or nothing was found */
    if (state->head == NULL) {
        *token_len = 0;
        PG_RETURN_INT32(0);
    }

    /* a result list without a current node: report -1 and leave the outputs alone */
    if (state->curr == NULL) {
        PG_RETURN_INT32(-1);
    }

    item = state->curr;

    /*
     * The token type is the first character of the SCWS attribute.
     * Anything outside 'a'..'x' is reported as 'x' (unknown), so attributes
     * such as Ag, Dg, Ng, Tg and Vg end up as unknown.
     * NOTE(review): this range also folds 'y' and 'z' into 'x' although
     * init_type describes both of them - confirm whether that is intended.
     * For the full attribute list see
     * http://www.xunsearch.com/scws/docs.php#attr
     */
    lex_type = (int)(item->attr)[0];
    if (lex_type < (int)'a' || lex_type > (int)'x') {
        lex_type = (int)'x';
    }

    *token_len = item->len;
    *token = state->buffer + item->off;

    /* advance; once the list is used up, release it and fetch the next one */
    state->curr = item->next;
    if (state->curr == NULL) {
        scws_free_result(state->head);
        state->head = state->curr = scws_get_result(state->scws);
    }

    PG_RETURN_INT32(lex_type);
}

/*
 * chprs_end
 *	  Parser "end" callback.  Nothing to do here: result lists are released
 *	  by chprs_getlexeme as they are consumed, and the ParserState itself
 *	  is part of the session context.
 */
Datum chprs_end(PG_FUNCTION_ARGS)
{
    PG_RETURN_VOID();
}

/*
 * chprs_lextype
 *	  Parser "lextypes" callback: return a freshly palloc'd table that
 *	  describes the 26 token types, followed by one terminating entry.
 */
Datum chprs_lextype(PG_FUNCTION_ARGS)
{
    const int lex_type_count = 26;
    /* one extra slot for the terminator written by init_type */
    size_t table_bytes = sizeof(LexDescr) * (lex_type_count + 1);
    LexDescr *table = (LexDescr *) palloc(table_bytes);

    init_type(table);

    PG_RETURN_POINTER(table);
}

/*
 * init_type
 *	  Fill descr[0..25] with the token types 'a'..'z' (lexid is the ASCII
 *	  code of the alias letter) and mark descr[26] as the terminator by
 *	  setting its lexid to 0.  alias and descr strings are palloc'd copies.
 *
 * The caller must provide room for at least 27 entries.
 * For the full attribute explanation, visit
 * http://www.xunsearch.com/scws/docs.php#attr
 */
static void init_type(LexDescr descr[])
{
    static const struct {
        int lexid;
        const char *alias;
        const char *descr;
    } lex_table[] = {
        { 97,  "a", "adjective,形容词" },
        { 98,  "b", "differentiation,区别词" },
        { 99,  "c", "conjunction,连词" },
        { 100, "d", "adverb,副词" },
        { 101, "e", "exclamation,感叹词" },
        { 102, "f", "position,方位词" },
        { 103, "g", "root,词根" },
        { 104, "h", "head,前连接成分" },
        { 105, "i", "idiom,成语" },
        { 106, "j", "abbreviation,简称" },
        { 107, "k", "tail,后连接成分" },
        { 108, "l", "tmp,习用语" },
        { 109, "m", "numeral,数词" },
        { 110, "n", "noun,名词" },
        { 111, "o", "onomatopoeia,拟声词" },
        { 112, "p", "prepositional,介词" },
        { 113, "q", "quantity,量词" },
        { 114, "r", "pronoun,代词" },
        { 115, "s", "space,处所词" },
        { 116, "t", "time,时语素" },
        { 117, "u", "auxiliary,助词" },
        { 118, "v", "verb,动词" },
        { 119, "w", "punctuation,标点符号" },
        { 120, "x", "unknown,未知词" },
        { 121, "y", "modal,语气词" },
        { 122, "z", "status,状态词" },
    };
    const size_t lex_count = sizeof(lex_table) / sizeof(lex_table[0]);
    size_t i;

    for (i = 0; i < lex_count; i++) {
        descr[i].lexid = lex_table[i].lexid;
        descr[i].alias = pstrdup(lex_table[i].alias);
        descr[i].descr = pstrdup(lex_table[i].descr);
    }

    /* terminator: only lexid is set, exactly as consumers of the table expect */
    descr[lex_count].lexid = 0;
}
//TODO :headline function
